{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Introduction to DataFrames\n",
    "**[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "using DataFrames # load package"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Split-apply-combine"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"data-frame\"><thead><tr><th></th><th>id</th><th>id2</th><th>v</th></tr></thead><tbody><tr><th>1</th><td>1</td><td>1</td><td>0.305809</td></tr><tr><th>2</th><td>2</td><td>2</td><td>0.391227</td></tr><tr><th>3</th><td>3</td><td>1</td><td>0.256948</td></tr><tr><th>4</th><td>4</td><td>2</td><td>0.201459</td></tr><tr><th>5</th><td>1</td><td>1</td><td>0.489054</td></tr><tr><th>6</th><td>2</td><td>2</td><td>0.620628</td></tr><tr><th>7</th><td>3</td><td>1</td><td>0.470056</td></tr><tr><th>8</th><td>4</td><td>2</td><td>0.674783</td></tr></tbody></table>"
      ],
      "text/plain": [
       "8×3 DataFrames.DataFrame\n",
       "│ Row │ id │ id2 │ v        │\n",
       "├─────┼────┼─────┼──────────┤\n",
       "│ 1   │ 1  │ 1   │ 0.305809 │\n",
       "│ 2   │ 2  │ 2   │ 0.391227 │\n",
       "│ 3   │ 3  │ 1   │ 0.256948 │\n",
       "│ 4   │ 4  │ 2   │ 0.201459 │\n",
       "│ 5   │ 1  │ 1   │ 0.489054 │\n",
       "│ 6   │ 2  │ 2   │ 0.620628 │\n",
       "│ 7   │ 3  │ 1   │ 0.470056 │\n",
       "│ 8   │ 4  │ 2   │ 0.674783 │"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x = DataFrame(id=[1,2,3,4,1,2,3,4], id2=[1,2,1,2,1,2,1,2], v=rand(8))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DataFrames.GroupedDataFrame  4 groups with keys: Symbol[:id]\n",
       "First Group:\n",
       "2×3 DataFrames.SubDataFrame{Array{Int64,1}}\n",
       "│ Row │ id │ id2 │ v        │\n",
       "├─────┼────┼─────┼──────────┤\n",
       "│ 1   │ 1  │ 1   │ 0.305809 │\n",
       "│ 2   │ 1  │ 1   │ 0.489054 │\n",
       "⋮\n",
       "Last Group:\n",
       "2×3 DataFrames.SubDataFrame{Array{Int64,1}}\n",
       "│ Row │ id │ id2 │ v        │\n",
       "├─────┼────┼─────┼──────────┤\n",
       "│ 1   │ 4  │ 2   │ 0.201459 │\n",
       "│ 2   │ 4  │ 2   │ 0.674783 │"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gx1 = groupby(x, :id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DataFrames.GroupedDataFrame  4 groups with keys: Symbol[:id, :id2]\n",
       "First Group:\n",
       "2×3 DataFrames.SubDataFrame{Array{Int64,1}}\n",
       "│ Row │ id │ id2 │ v        │\n",
       "├─────┼────┼─────┼──────────┤\n",
       "│ 1   │ 1  │ 1   │ 0.305809 │\n",
       "│ 2   │ 1  │ 1   │ 0.489054 │\n",
       "⋮\n",
       "Last Group:\n",
       "2×3 DataFrames.SubDataFrame{Array{Int64,1}}\n",
       "│ Row │ id │ id2 │ v        │\n",
       "├─────┼────┼─────┼──────────┤\n",
       "│ 1   │ 4  │ 2   │ 0.201459 │\n",
       "│ 2   │ 4  │ 2   │ 0.674783 │"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gx2 = groupby(x, [:id, :id2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"data-frame\"><thead><tr><th></th><th>id</th><th>id2</th><th>v</th></tr></thead><tbody><tr><th>1</th><td>1</td><td>1</td><td>0.305809</td></tr><tr><th>2</th><td>1</td><td>1</td><td>0.489054</td></tr><tr><th>3</th><td>2</td><td>2</td><td>0.391227</td></tr><tr><th>4</th><td>2</td><td>2</td><td>0.620628</td></tr><tr><th>5</th><td>3</td><td>1</td><td>0.256948</td></tr><tr><th>6</th><td>3</td><td>1</td><td>0.470056</td></tr><tr><th>7</th><td>4</td><td>2</td><td>0.201459</td></tr><tr><th>8</th><td>4</td><td>2</td><td>0.674783</td></tr></tbody></table>"
      ],
      "text/plain": [
       "8×3 DataFrames.DataFrame\n",
       "│ Row │ id │ id2 │ v        │\n",
       "├─────┼────┼─────┼──────────┤\n",
       "│ 1   │ 1  │ 1   │ 0.305809 │\n",
       "│ 2   │ 1  │ 1   │ 0.489054 │\n",
       "│ 3   │ 2  │ 2   │ 0.391227 │\n",
       "│ 4   │ 2  │ 2   │ 0.620628 │\n",
       "│ 5   │ 3  │ 1   │ 0.256948 │\n",
       "│ 6   │ 3  │ 1   │ 0.470056 │\n",
       "│ 7   │ 4  │ 2   │ 0.201459 │\n",
       "│ 8   │ 4  │ 2   │ 0.674783 │"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vcat(gx2...) # back to the original DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"data-frame\"><thead><tr><th></th><th>id</th><th>x</th></tr></thead><tbody><tr><th>1</th><td>missing</td><td>1</td></tr><tr><th>2</th><td>5</td><td>2</td></tr><tr><th>3</th><td>1</td><td>3</td></tr><tr><th>4</th><td>3</td><td>4</td></tr><tr><th>5</th><td>missing</td><td>5</td></tr></tbody></table>"
      ],
      "text/plain": [
       "5×2 DataFrames.DataFrame\n",
       "│ Row │ id      │ x │\n",
       "├─────┼─────────┼───┤\n",
       "│ 1   │ \u001b[90mmissing\u001b[39m │ 1 │\n",
       "│ 2   │ 5       │ 2 │\n",
       "│ 3   │ 1       │ 3 │\n",
       "│ 4   │ 3       │ 4 │\n",
       "│ 5   │ \u001b[90mmissing\u001b[39m │ 5 │"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x = DataFrame(id = [missing, 5, 1, 3, missing], x = 1:5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DataFrames.GroupedDataFrame  4 groups with keys: Symbol[:id]\n",
      "gd[1]:\n",
      "2×2 DataFrames.SubDataFrame{Array{Int64,1}}\n",
      "│ Row │ id      │ x │\n",
      "├─────┼─────────┼───┤\n",
      "│ 1   │ \u001b[90mmissing\u001b[39m │ 1 │\n",
      "│ 2   │ \u001b[90mmissing\u001b[39m │ 5 │gd[2]:\n",
      "1×2 DataFrames.SubDataFrame{Array{Int64,1}}\n",
      "│ Row │ id │ x │\n",
      "├─────┼────┼───┤\n",
      "│ 1   │ 5  │ 2 │gd[3]:\n",
      "1×2 DataFrames.SubDataFrame{Array{Int64,1}}\n",
      "│ Row │ id │ x │\n",
      "├─────┼────┼───┤\n",
      "│ 1   │ 1  │ 3 │gd[4]:\n",
      "1×2 DataFrames.SubDataFrame{Array{Int64,1}}\n",
      "│ Row │ id │ x │\n",
      "├─────┼────┼───┤\n",
      "│ 1   │ 3  │ 4 │"
     ]
    }
   ],
   "source": [
    "showall(groupby(x, :id)) # by default groups include mising values and are not sorted"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DataFrames.GroupedDataFrame  3 groups with keys: Symbol[:id]\n",
      "gd[1]:\n",
      "1×2 DataFrames.SubDataFrame{Array{Int64,1}}\n",
      "│ Row │ id │ x │\n",
      "├─────┼────┼───┤\n",
      "│ 1   │ 1  │ 3 │gd[2]:\n",
      "1×2 DataFrames.SubDataFrame{Array{Int64,1}}\n",
      "│ Row │ id │ x │\n",
      "├─────┼────┼───┤\n",
      "│ 1   │ 3  │ 4 │gd[3]:\n",
      "1×2 DataFrames.SubDataFrame{Array{Int64,1}}\n",
      "│ Row │ id │ x │\n",
      "├─────┼────┼───┤\n",
      "│ 1   │ 5  │ 2 │"
     ]
    }
   ],
   "source": [
    "showall(groupby(x, :id, sort=true, skipmissing=true)) # but we can change it :)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"data-frame\"><thead><tr><th></th><th>id</th><th>x1</th></tr></thead><tbody><tr><th>1</th><td>'a'</td><td>0.523672</td></tr><tr><th>2</th><td>'b'</td><td>0.537341</td></tr><tr><th>3</th><td>'d'</td><td>0.458676</td></tr><tr><th>4</th><td>'c'</td><td>0.499453</td></tr></tbody></table>"
      ],
      "text/plain": [
       "4×2 DataFrames.DataFrame\n",
       "│ Row │ id  │ x1       │\n",
       "├─────┼─────┼──────────┤\n",
       "│ 1   │ 'a' │ 0.523672 │\n",
       "│ 2   │ 'b' │ 0.537341 │\n",
       "│ 3   │ 'd' │ 0.458676 │\n",
       "│ 4   │ 'c' │ 0.499453 │"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x = DataFrame(id=rand('a':'d', 100), v=rand(100));\n",
    "by(x, :id, y->mean(y[:v])) # apply a function to each group of a data frame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"data-frame\"><thead><tr><th></th><th>id</th><th>x1</th></tr></thead><tbody><tr><th>1</th><td>'a'</td><td>0.523672</td></tr><tr><th>2</th><td>'b'</td><td>0.537341</td></tr><tr><th>3</th><td>'c'</td><td>0.499453</td></tr><tr><th>4</th><td>'d'</td><td>0.458676</td></tr></tbody></table>"
      ],
      "text/plain": [
       "4×2 DataFrames.DataFrame\n",
       "│ Row │ id  │ x1       │\n",
       "├─────┼─────┼──────────┤\n",
       "│ 1   │ 'a' │ 0.523672 │\n",
       "│ 2   │ 'b' │ 0.537341 │\n",
       "│ 3   │ 'c' │ 0.499453 │\n",
       "│ 4   │ 'd' │ 0.458676 │"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "by(x, :id, y->mean(y[:v]), sort=true) # we can sort the output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"data-frame\"><thead><tr><th></th><th>id</th><th>res</th></tr></thead><tbody><tr><th>1</th><td>'a'</td><td>0.523672</td></tr><tr><th>2</th><td>'b'</td><td>0.537341</td></tr><tr><th>3</th><td>'d'</td><td>0.458676</td></tr><tr><th>4</th><td>'c'</td><td>0.499453</td></tr></tbody></table>"
      ],
      "text/plain": [
       "4×2 DataFrames.DataFrame\n",
       "│ Row │ id  │ res      │\n",
       "├─────┼─────┼──────────┤\n",
       "│ 1   │ 'a' │ 0.523672 │\n",
       "│ 2   │ 'b' │ 0.537341 │\n",
       "│ 3   │ 'd' │ 0.458676 │\n",
       "│ 4   │ 'c' │ 0.499453 │"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "by(x, :id, y->DataFrame(res=mean(y[:v]))) # this way we can set a name for a column - DataFramesMeta @by is better"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"data-frame\"><thead><tr><th></th><th>id</th><th>x1_sum</th><th>x2_sum</th></tr></thead><tbody><tr><th>1</th><td>'c'</td><td>14.7061</td><td>11.7598</td></tr><tr><th>2</th><td>'b'</td><td>11.3694</td><td>9.21505</td></tr><tr><th>3</th><td>'a'</td><td>14.5423</td><td>12.0944</td></tr><tr><th>4</th><td>'d'</td><td>10.9314</td><td>10.3805</td></tr></tbody></table>"
      ],
      "text/plain": [
       "4×3 DataFrames.DataFrame\n",
       "│ Row │ id  │ x1_sum  │ x2_sum  │\n",
       "├─────┼─────┼─────────┼─────────┤\n",
       "│ 1   │ 'c' │ 14.7061 │ 11.7598 │\n",
       "│ 2   │ 'b' │ 11.3694 │ 9.21505 │\n",
       "│ 3   │ 'a' │ 14.5423 │ 12.0944 │\n",
       "│ 4   │ 'd' │ 10.9314 │ 10.3805 │"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x = DataFrame(id=rand('a':'d', 100), x1=rand(100), x2=rand(100))\n",
    "aggregate(x, :id, sum) # apply a function over all columns of a data frame in groups given by id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"data-frame\"><thead><tr><th></th><th>id</th><th>x1_sum</th><th>x2_sum</th></tr></thead><tbody><tr><th>1</th><td>'a'</td><td>14.5423</td><td>12.0944</td></tr><tr><th>2</th><td>'b'</td><td>11.3694</td><td>9.21505</td></tr><tr><th>3</th><td>'c'</td><td>14.7061</td><td>11.7598</td></tr><tr><th>4</th><td>'d'</td><td>10.9314</td><td>10.3805</td></tr></tbody></table>"
      ],
      "text/plain": [
       "4×3 DataFrames.DataFrame\n",
       "│ Row │ id  │ x1_sum  │ x2_sum  │\n",
       "├─────┼─────┼─────────┼─────────┤\n",
       "│ 1   │ 'a' │ 14.5423 │ 12.0944 │\n",
       "│ 2   │ 'b' │ 11.3694 │ 9.21505 │\n",
       "│ 3   │ 'c' │ 14.7061 │ 11.7598 │\n",
       "│ 4   │ 'd' │ 10.9314 │ 10.3805 │"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "aggregate(x, :id, sum, sort=true) # also can be sorted"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*We omit the discussion of of map/combine as I do not find them very useful (better to use by)*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"data-frame\"><thead><tr><th></th><th>x1</th><th>x2</th><th>x3</th><th>x4</th><th>x5</th></tr></thead><tbody><tr><th>1</th><td>0.945035</td><td>0.0997115</td><td>0.806288</td><td>0.30923</td><td>0.271485</td></tr><tr><th>2</th><td>0.210335</td><td>0.594834</td><td>0.895877</td><td>0.442921</td><td>0.634513</td></tr><tr><th>3</th><td>0.289443</td><td>0.350774</td><td>0.441061</td><td>0.35858</td><td>0.889212</td></tr></tbody></table>"
      ],
      "text/plain": [
       "3×5 DataFrames.DataFrame\n",
       "│ Row │ x1       │ x2        │ x3       │ x4       │ x5       │\n",
       "├─────┼──────────┼───────────┼──────────┼──────────┼──────────┤\n",
       "│ 1   │ 0.945035 │ 0.0997115 │ 0.806288 │ 0.30923  │ 0.271485 │\n",
       "│ 2   │ 0.210335 │ 0.594834  │ 0.895877 │ 0.442921 │ 0.634513 │\n",
       "│ 3   │ 0.289443 │ 0.350774  │ 0.441061 │ 0.35858  │ 0.889212 │"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x = DataFrame(rand(3, 5))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"data-frame\"><thead><tr><th></th><th>x1</th><th>x2</th><th>x3</th><th>x4</th><th>x5</th></tr></thead><tbody><tr><th>1</th><td>0.481604</td><td>0.34844</td><td>0.714409</td><td>0.370244</td><td>0.598403</td></tr></tbody></table>"
      ],
      "text/plain": [
       "1×5 DataFrames.DataFrame\n",
       "│ Row │ x1       │ x2      │ x3       │ x4       │ x5       │\n",
       "├─────┼──────────┼─────────┼──────────┼──────────┼──────────┤\n",
       "│ 1   │ 0.481604 │ 0.34844 │ 0.714409 │ 0.370244 │ 0.598403 │"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "map(mean, eachcol(x)) # map a function over each column and return a data frame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "x1: 0.4816043926633542\n",
      "x2: 0.34843991152206805\n",
      "x3: 0.7144087007367234\n",
      "x4: 0.3702439522046452\n",
      "x5: 0.5984033693699742\n"
     ]
    }
   ],
   "source": [
    "foreach(c -> println(c[1], \": \", mean(c[2])), eachcol(x)) # a raw iteration returns a tuple with column name and values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5-element Array{Float64,1}:\n",
       " 0.481604\n",
       " 0.34844 \n",
       " 0.714409\n",
       " 0.370244\n",
       " 0.598403"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "colwise(mean, x) # colwise is similar, but produces a vector"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2-element Array{Array{Float64,1},1}:\n",
       " [0.577685, 0.347273, 0.851083, 0.376076, 0.452999, 1.0]\n",
       " [0.289443, 0.350774, 0.441061, 0.35858, 0.889212, 2.0] "
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x[:id] = [1,1,2]\n",
    "colwise(mean,groupby(x, :id)) # and works on GroupedDataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3-element Array{Float64,1}:\n",
       " 9.47769 \n",
       " 0.353603\n",
       " 0.825155"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "map(r -> r[:x1]/r[:x2], eachrow(x)) # now the returned value is DataFrameRow which works similarly to a one-row DataFrame"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Julia 0.6.2",
   "language": "julia",
   "name": "julia-0.6"
  },
  "language_info": {
   "file_extension": ".jl",
   "mimetype": "application/julia",
   "name": "julia",
   "version": "0.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
